####################################
# SELECT CASES FOR PROCESS TRACING #
####################################

# Author: Kasia Nalewajko
# First created: 3 October 2021
# Replicated: 17 June 2024

# Note: As noted in the Online Annex (page 59), I use a model from a previous version of the manuscript, which was updated during the later review process. Unlike the manuscript and all additional analyses, this model is run on the province level.

rm(list = ls())

# LOAD PACKAGES -----------------------------------------------------------

if (!require("dplyr")) install.packages("dplyr")
if (!require("fixest")) install.packages("fixest")
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("viridis")) install.packages("viridis")

# LOAD DATA ---------------------------------------------------------------

load("./00 SUBMITTED/00 APSR final/04 replication_files/01 data/main.Rda")
load("./00 SUBMITTED/00 APSR final/04 replication_files/01 data/main_dept.Rda")
load("./00 SUBMITTED/00 APSR final/04 replication_files/01 data/deppct_zones_distances_resregions.Rda")

# Create province-level data ---------------------------------------------

# take most up-to-date county-level data
dept <- main %>% 
  group_by(dept_code) %>% 
  summarise(all_jewish_victims = sum(all_jewish_victims, na.rm = T),
            pop1936 = sum(pop1936, na.rm = T),
            allocated_jpop = sum(allocated_jpop, na.rm = T),
            synagogues = sum(nsynagogues, na.rm = T),
            collabos_antijew = sum(collabos_antijew, na.rm = T),
            DHI_milipol_sum1942 = sum(DHI_milipol_sum1942, na.rm = T),
            catholic_churches = sum(catholic_churches, na.rm = T),
            area_sqkm = sum(area_sqkm, na.rm = T),
            FFIFFCgentile = sum(FFIFFCgentile, na.rm = T),
            mpf_sum = sum(mpf_sum, na.rm = T)) %>% 
  mutate(collabos_antijew1000 = collabos_antijew*1000/pop1936,
         mpf1000 = mpf_sum*1000/pop1936,
         FFIFFC1000gentile = FFIFFCgentile*1000/pop1936)

# add information about La Résistance regions
dept_zones <- deppct_zones %>% 
  dplyr::select(dept_code, resregion2) %>% 
  unique() %>% 
  arrange(dept_code) %>% 
  filter(resregion2 != "divided" & !dept_code %in% c(16,18,33,37,39,40,64,71,86))

missing_dept <- data.frame(dept_code = c(16,18,33,37,39,40,64,71,86), resregion2 = "divided")

dept_zones <- rbind(dept_zones, missing_dept)

dept <- left_join(dept, dept_zones)

# get province names and occupation zones
zone5 <- frdep %>% 
  dplyr::select(dept_code, zone5, dept_name) %>% 
  unique()

dept <- left_join(dept, zone5)

dept <- dept %>% 
  mutate_all(~ifelse(is.nan(.), NA, .))

# use old version of the dataset that includes old province-level 1914 and 1936 elections, selected 1936 census data and information about immigration (the latter two are unavailable on the county level)
temp <- frdep %>%
  unique()

dept <- left_join(dept, temp, by = "dept_code")


# RUN MODELS --------------------------------------------------------------

dept$controls_s1_pred <- NULL

# run (manually) the first-stage of the old model (old specification)
controls_s1 <- lm(log(FFIFFC1000gentile+1) ~ log(mpf1000+1) + log(vote_center_pct_1914+1) + log(vote_RLdiff_pct_1914+1) + log(turnout1914+1) + log(unemployed_total+1) + log(agriculture+1) + log(military+1) + log(industry+1) + log(public_sector+1),
                  data = dept)

# store the predicted values from first stage (controls)
dept <- dept %>%
  modelr::add_predictions(model = controls_s1) %>% 
  dplyr::rename(controls_s1_pred = pred)

# run the second stage (old specification)
full2 <- fixest::feols(log(all_jewish_victims+1) ~ 
                         log(collabos_antijew+1) +
                         log(nsynagogues+1) + 
                         log(DHI_milipol_sum1942+1) + 
                         log(german1000+1) + 
                         log(polish1000+1) + 
                         log(russian1000+1) + 
                         log(czechoslovakian1000+1) + 
                         log(foreigners_sum1000+1) + 
                         log(catholic_churches+1) + 
                         log(turnout1936+1) +
                         log(vote_center_pct1936+1) +
                         log(vote_RLdiff_pct1936+1) +
                         log(rug_sd+1) +
                         frelev_mean +
                         area_sqkm.y
                       | `zone5.x` | log(FFIFFC1000gentile+1) ~ log(mpf1000+1) + log(vote_center_pct_1914+1) + log(vote_RLdiff_pct_1914+1) + log(turnout1914+1) + log(unemployed_total+1) + log(agriculture+1) + log(military+1) + log(industry+1) + log(public_sector+1),
                       cluster = ~resregion2,
                       data = dept
)


obsRemoved <- abs(full2[["obs_selection"]][["obsRemoved"]])
dept <- dept[-obsRemoved,]
temp <- broom::augment(x = full2, data = dept)
temp$fitted <- fitted.values(full2)

highlight_df <- temp %>% 
  filter(`dept_name.x` %in% c("Creuse", "Gironde", "Dordogne", "Vosges", "Doubs", "Somme"))
highlight_df$`dept_name.x` <- factor(highlight_df$`dept_name.x`, levels=c("Creuse", "Gironde", "Dordogne", "Vosges", "Doubs", "Somme"), labels=c("Creuse", "Gironde", "Dordogne", "Vosges", "Doubs", "Somme"))

# PLOT --------------------------------------------------------------

temp %>% 
  ggplot() +
  geom_point(aes(x = controls_s1_pred, y = `.resid`), 
             alpha = 3/10, 
             colour = "grey") +
  geom_hline(aes(yintercept = 0), linetype = 2, color = "gray20") +
  geom_point(data = highlight_df,
             aes(x = controls_s1_pred, y = `.resid`, color = `dept_name.x`),
             size = 3) +
  geom_text(data = temp,
            aes(x = controls_s1_pred, y = `.resid`, label = `dept_name.x`),
            size = 2,
            vjust = 0,
            nudge_y = 0.07,
            show.legend = FALSE) +
  scale_color_viridis_d(option = "inferno", begin = 0.2, end = 0.85) +
  labs(
       x = "Number insurgents per 1,000 inhabitants instrumented by WW1 deaths | covariates",
       y = "Model residuals",
       color = "Province (département)",
       caption = "") +
  theme_bw() +
  theme(legend.position = "bottom")

temp %>% 
  filter(dept_name.x %in% highlight_df$`dept_name.x`) %>% 
  dplyr::select(dept_name.x, `.resid`)

# EXPORT --------------------------------------------------------------

ggsave(
  "G1_case_selection.png",
  plot = last_plot(),
  path = "./00 SUBMITTED/00 APSR final/03 dataverse_online_appendix/figures",
  width = 19,
  height = 13,
  units = "cm",
  dpi = 300
)


